home *** CD-ROM | disk | FTP | other *** search
-
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- #ifndef lint
- static char *RCSid = "$Header: /y/src/wais/wais-8-b5/ir/RCS/irbuild.c,v 1.47.1.1 1992/07/11 01:01:21 curtisg Exp curtisg $";
- #endif
-
- /*
- * Building an index with a Unix shell interface.
- *
- * -brewster 6/90
- */
-
- /* Change log:
- * added -stdio option from jik@athena.mit.edu
- * $Log: irbuild.c,v $
- *
- * M000, 28-May-93, hess
- * mods for ERG database files.
- * Same as the mail type, but bails out after ~200 lines to avoid very large
- * indexes. Because the "From " portion of the file is often indented,
- * our mail threads look like one large piece of mail, which is ok: they
- * are intended to be read as a whole. But we still need to make sure
- * they aren't broken up, so never let the separator routine return true.
- *
- * Revision 1.47.1.1 1992/07/11 01:01:21 curtisg
- * Changes for SCO UNIX
- *
- * Revision 1.47 92/05/10 14:48:17 jonathan
- * Updated for release.
- *
- * Revision 1.46 92/05/08 10:03:17 jonathan
- * Adjusted memory paramters. It's closer...
- *
- * Revision 1.45 92/05/06 17:26:46 jonathan
- * Added switch for indexing contents, new user-specified type name, new type:
- * filename, which only puts the name of the file in the header.
- *
- * Revision 1.44 92/04/25 21:14:35 brewster
- * added ziff
- *
- * Revision 1.43 92/04/22 15:29:13 jonathan
- * Added jargon to usage message.
- *
- * Revision 1.42 92/04/01 17:08:50 jonathan
- * Added FTP type.
- *
- * Revision 1.41 92/03/25 18:49:39 jonathan
- * Added log_level and log_file arguments.
- *
- * Revision 1.40 92/03/22 18:38:14 brewster
- * added objective C filter
- *
- * Revision 1.39 92/03/20 11:02:44 jonathan
- * Added code to handle switches for word_pairs and word_postition info.
- *
- * Revision 1.38 92/03/17 07:34:32 jonathan
- * Fixed spacing in usage message.
- *
- * Revision 1.37 92/03/10 10:42:51 morris
- * fixed small bug in command line argument handleing. doesn't die if there
- * are no args.
- *
- * Revision 1.36 92/03/05 07:05:32 shen
- * add cm grow percent and textsize to command line and init search engine
- *
- * Revision 1.35 92/03/04 16:34:09 jonathan
- * Set wais_pid from getpid().
- *
- * Revision 1.34 92/02/20 09:49:37 jonathan
- * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl.
- *
- * Revision 1.33 92/02/17 14:21:08 jonathan
- * Added switch to disable creation of catalog (-nocat).
- *
- * Revision 1.32 92/02/17 12:41:55 jonathan
- * Added RCSid.
- *
- * Revision 1.31 92/02/17 12:41:01 jonathan
- * Build catalog after completion of indexing.
- *
- * Revision 1.30 92/02/12 13:22:53 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /* to do:
- * done: make incremental indexing not index things that are already indexed
- * add extra arg -register that will send in description of the server to
- * the directory of servers.
- * done: create a source struct in the .src file
- * make it continuously index to keep itself up to date.
- *
- */
-
- #include <string.h>
- #include <sys/types.h>
- #include <sys/param.h>
- #include <sys/stat.h>
- #include "irdirent.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
- #include "panic.h"
- #include "ircfiles.h"
- #include "version.h"
- #include "irext.h"
-
- #define INDEXER_DATE "Sun May 10 1992"
-
- /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */
-
- extern boolean indexingForBeta;
-
- struct file_type {
- char *name;
- char *description;
- char *type;
- boolean (*separator_function)();
- void (*header_function)();
- long (*date_function)();
- void (*finish_header_function)();
- boolean index_contents;
- } file_type_list[] = {
- {"groliers",
- "groliers encyclopedia special format",
- "TEXT", groliers_separator_function, groliers_header_function, 0, groliers_finish_header_function, 1},
- #ifdef NEXT
- {"objc",
- "objective-C .h and .m files",
- "TEXT", qobjc_separator_function, wobj_header_function, 0, wobj_finish_header_function, 1},
- #endif /* NEXT */
- {"mail",
- "mail format",
- "TEXT", mail_or_rmail_separator, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"mail_or_rmail",
- "mail or rmail or both",
- "TEXT", mail_or_rmail_separator, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"mail_digest",
- "standard internet mail digest format",
- "TEXT", mail_digest_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"mh_bboard",
- "MH bulletin board format",
- "TEXT", mh_bboard_separator_function, mail_header_function, 0, mail_finish_header_function, 1},
- {"rmail",
- "rmail format",
- "TEXT", rmail_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"netnews",
- "netnews format",
- "TEXT", 0, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"rn",
- "netnews saved by the [rt]?rn newsreader",
- "TEXT", rn_separator_function, mail_header_function, mail_date_function, mail_finish_header_function, 1},
- {"emacsinfo",
- "the GNU documentation system",
- "TEXT", emacs_info_separator_function, emacs_info_header_function, 0, emacs_info_finish_header_function, 1},
- {"catalog",
- "??",
- "TEXT", catalog_separator_function, catalog_header_function, 0, catalog_finish_header_function, 1},
- {"bio",
- "biology abstract format",
- "TEXT", bio_separator_function, bio_header_function, 0, bio_finish_header_function, 1},
- {"cmapp",
- "CM applications from Hypercard",
- "TEXT", cmapp_separator_function, cmapp_header_function, 0, cmapp_finish_header_function, 1},
- {"ftp",
- "special type for FTP files. First line of file is headline",
- "TEXT", first_line_separator_function, first_line_header_function, 0, first_line_finish_header_function, 1},
- {"jargon",
- "Jargon File 2.9.8 format",
- "TEXT", jargon_separator_function, jargon_header_function, 0, jargon_finish_header_function, 1},
- {"server",
- "server structures for the dir of servers",
- "WSRC", 0, 0, 0, filename_finish_header_function},
- {"text",
- "simple text files, this is the default",
- "TEXT", 0,0,0,0, 1},
- {"filename",
- "uses only the filename part of the pathname for the title",
- "TEXT", 0,0,0, filename_finish_header_function, 1},
- {"irg",
- "internet resource guide",
- "TEXT", irg_separator_function, irg_header_function, 0, irg_finish_header_function, 1},
- {"dash",
- "entries separated by a row of dashes",
- "TEXT", dash_separator_function, dash_header_function, 0, dash_finish_header_function, 1},
- {"one_line",
- "each line is a document",
- "TEXT", one_line_separator_function, one_line_header_function, 0, one_line_finish_header_function, 1},
- {"para",
- "paragraphs separated by blank lines",
- "TEXT", para_separator_function, para_header_function, 0, para_finish_header_function, 1},
- {"seeker",
- "??",
- "TEXT", seeker_separator_function, seeker_header_function, 0, seeker_finish_header_function, 1},
- {"medline",
- "medline format",
- "TEXT", medline_separator_function, medline_header_function, 0, medline_finish_header_function, 1},
- {"refer",
- "refer format",
- "TEXT", refer_separator_function, refer_header_function, 0, refer_finish_header_function, 1},
- {"first_line",
- "first line of file is headline",
- "TEXT", first_line_separator_function, first_line_header_function, 0, first_line_finish_header_function, 1},
- {"rlin",
- "??",
- "TEXT", rlin_separator_function, rlin_header_function, 0, rlin_finish_header_function, 1},
- {"dvi",
- "dvi format",
- "DVI", 0, 0, 0, filename_finish_header_function, 1},
- {"ps",
- "postscript format ",
- "PS", 0, 0, 0, filename_finish_header_function, 0},
- {"pict",
- "pict files, only indexes the filename",
- "PICT", 0, 0, 0, filename_finish_header_function, 0},
- {"gif",
- "gif files, only indexes the filename",
- "GIF", 0, 0, 0, filename_finish_header_function, 0},
- {"tiff",
- "tiff files, only indexes the filename",
- "TIFF", 0, 0, 0, filename_finish_header_function, 0},
- {"bibtex",
- "BibTeX / LaTeX format",
- "TEXT", bibtex_separator_function, bibtex_header_function, 0, bibtex_finish_header_function, 1},
- {"nhyp",
- "?:? hyper text format, Polytechnic of Central London",
- "TEXT", nhyp_separator_function, nhyp_header_function, 0, nhyp_finish_header_function, 1},
- {"ziff",
- "ziff special format",
- "TEXT", ziff_separator_function, ziff_header_function, 0, ziff_finish_header_function, 1},
- #ifdef /* sco */ M_UNIX
- {"erg_mail_thread",
- "SCO ERG mail thread format",
- "TEXT", erg_thread_separator_function,
- erg_thread_header_function,
- mail_date_function,
- erg_thread_finish_header_function, 1},
- {"mmdf",
- "MMDF mail folder format",
- "TEXT", mmdf_separator_function,
- mail_header_function,
- mail_date_function,
- mail_finish_header_function, 1},
-
- { "change_desc",
- "SCO ERG change descriptions format",
- "TEXT", first_line_separator_function,
- erg_cd_header_function,
- mail_date_function, /* fix this later, can look in the cd */
- erg_cd_finish_header_function, 1},
- #endif
- 0
- };
-
-
- void usage(command)
- char *command;
- {
- /* no args */
- struct file_type *t;
- int first;
-
- fprintf(stderr,"Usage: %s [-d index_filename]\n", command);
- fprintf(stderr," [-a] /* adding to an existing index, otherwise it erases the index */\n");
- fprintf(stderr," [-r] /* recursively index subdirectories */\n");
- fprintf(stderr," [-mem mbytes] /* number of megabytes to run this in */\n");
- fprintf(stderr," [-register] /* registers the database with the directory of servers.\n");
- fprintf(stderr," This should be done with care. */\n");
- fprintf(stderr," [-export] /* uses short dbname and port 210 */\n");
- fprintf(stderr," [-e [file]] /* set log output to file, or /dev/null if not specified */\n");
- fprintf(stderr," [-f [filter]] /* run filter on each file before indexing */\n");
- fprintf(stderr," [-l log_level] /* set log level. 0 means log nothing,\n");
- fprintf(stderr," 10 [the default] means log everything */\n");
- fprintf(stderr," [-v] /* print the version of the software */\n");
- fprintf(stderr," [-stdin] /* read file names from stdin */\n");
- fprintf(stderr," [-pos | -nopos] /* include (don't include - default) word position information /*\n");
- fprintf(stderr," [-nopairs | -pairs] /* don't include (or include - default) word pairs /*\n");
- fprintf(stderr," [-nocat] /* inhibit creation of catalog /*\n");
- fprintf(stderr," [-contents] /* Index the contents: this is good for types that\n");
- fprintf(stderr," inhibit the indexing of the contents (like gif). /*\n");
- fprintf(stderr," [-nocontents] /* Index only the filename, not the contents /*\n");
- fprintf(stderr," [-cmmem mem%] /* percent of CM memory (CM code only) */\n");
- fprintf(stderr," [-T type] /* type becomes the \"TYPE\" of the document. */\n");
- fprintf(stderr," [-t /* format of the file. if none then each file is a document */\n");
- for (t=file_type_list, first=0; t->name; t++, first++) {
- fprintf(stderr," %c %s /* %s */\n", first ? '|' : ' ', t->name, t->description);
- }
- #if 0
- fprintf(stderr," text /* simple text files, this is the default */\n");
- fprintf(stderr," | bibtex /* BibTeX / LaTeX format */\n");
- fprintf(stderr," | bio /* biology abstract format */\n");
- fprintf(stderr," | cmapp /* CM applications from Hypercard */\n");
- fprintf(stderr," | dash /* entries separated by a row of dashes */\n");
- fprintf(stderr," | dvi /* dvi format */\n");
- fprintf(stderr," | emacsinfo /* the GNU documentation system */\n");
- fprintf(stderr," | first_line /* first line of file is headline */\n");
- fprintf(stderr," | filename /* uses only the filename part of the pathname for the title */\n");
- fprintf(stderr," | ftp /* special type for FTP files. First line of file is headline */\n");
- fprintf(stderr," | gif /* gif files, only indexes the filename */\n");
- fprintf(stderr," | irg /* internet resource guide */\n");
- fprintf(stderr," | jargon /* Jargon File 2.9.8 format*/\n");
- fprintf(stderr," | mail_digest /* standard internet mail digest format */\n");
- fprintf(stderr," | mail_or_rmail /* mail or rmail or both */\n");
- fprintf(stderr," | medline /* medline format */\n");
- fprintf(stderr," | mh_bboard /* MH bulletin board format */\n");
- fprintf(stderr," | netnews /* netnews format */\n");
- fprintf(stderr," | nhyp /* ?:? hyper text format, Polytechnic of Central London */\n");
- fprintf(stderr," | one_line /* each line is a document */\n");
- fprintf(stderr," | para /* paragraphs separated by blank lines */\n");
- fprintf(stderr," | pict /* pict files, only indexes the filename */\n");
- fprintf(stderr," | ps /* postscript format */\n");
- fprintf(stderr," | refer /* refer format */\n");
- fprintf(stderr," | rn /* netnews saved by the [rt]?rn newsreader */\n");
- fprintf(stderr," | server /* server structures for the dir of servers */\n");
- #ifdef NeXT
- fprintf(stderr," | objc /* objective-C .h and .m files */\n");
- #endif /* def NeXT */
- fprintf(stderr," | tiff /* tiff files, only indexes the filename */\n");
- #endif /* 0 */
- fprintf(stderr," ] filename filename ...\n");
- }
-
- char *log_file_name = NULL;
- FILE *logfile;
-
- extern boolean index_contents;
- extern boolean filter_contents;
- extern char filter_program[];
-
-
/*
 * set(a, b): assign b to a, but only when b is non-zero (non-NULL), so an
 * empty table slot never overwrites a value that is already in place.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; a bare `if' would silently capture an `else'
 * written after the macro call.  b is evaluated twice, so do not pass an
 * expression with side effects.
 */
#define set(a,b) do { if (b) (a) = (b); } while (0)
-
- /* This is the MAIN for building an index.
- */
- void
- main(argc, argv)
- int argc;
- char *argv[];
- {
- database* db = NULL;
- long argc_copy = argc;
- char **argv_copy = argv;
- char *next_argument;
- char index_filename[1000];
- boolean (*separator_function)();
- void (*header_function)();
- void (*finish_header_function)();
- long (*date_function)();
- boolean adding_to_existing_index = false;
- boolean traverse_directory = false;
- boolean word_positions = false;
- boolean word_pairs = true;
- long memory_to_use = -1;
- long cm_mem_percent = 0; /* default */
- long grow_percent = 0; /* default */
- long text_size = 0; /* default */
- boolean check_for_text_file = false;
- boolean register_database = false;
- boolean export_database = false;
- boolean read_files_from_stdin = false;
- boolean make_catalog = true;
- char data_filename[MAX_PATH_NAME_LEN];
- char *typename = NULL; /* this is what the user said */
- char type[256];
- /* char *type = NULL; /* this is the type stored with the db */
- long start_of_filenames;
- long hashtable_size = 1L<<16;
- long flush_after_n_words = 300000;
- char *command_name;
- struct file_type *t;
-
- next_argument = next_arg(&argc, &argv);
- separator_function = NULL; /* initailize to nil */
- header_function = NULL;
- date_function = NULL;
- finish_header_function = NULL;
- /* type = "TEXT"; /* default to text */
- strcpy(type, "TEXT");
- typename = "Text";
-
- command_name = next_argument;
-
- logfile = stderr;
- wais_pid = getpid();
-
- if(0 == argc) {
- usage(command_name);
- exit(0);
- }
-
- #ifdef THINK_C
- strcpy(index_filename, "wais:System Folder:wais-index:index");
- #else
- strcpy(index_filename, "index"); /* in the current directory */
- #endif /* THINK_C */
-
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"No arguments specified\n");
- exit(0);
- }
- while((next_argument != NULL) && '-' == next_argument[0]){
- /* then we have an argument to process */
- if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */
- (0 == strcmp("-d", next_argument))){
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"Expected filename for the index\n");
- exit(0);
- }
- strcpy(index_filename, next_argument);
- }
- else if(0 == strcmp("-a", next_argument)){
- adding_to_existing_index = true;
- }
- else if(0 == strcmp("-r", next_argument)){
- traverse_directory = true;
- }
- else if(0 == strcmp("-register", next_argument)){
- register_database = true;
- }
- else if(0 == strcmp("-export", next_argument)){
- export_database = true;
- }
- else if(0 == strcmp("-f", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"Expected filter for -f\n");
- exit(1);
- }
- strcpy(filter_program, next_argument);
- filter_contents = TRUE;
- }
- else if(0 == strcmp("-v", next_argument)){
- fprintf(stderr,"%s: %s\n", command_name, VERSION, INDEXER_DATE);
- }
- else if (0 == strcmp("-stdin", next_argument)) {
- read_files_from_stdin = true;
- }
- else if (0 == strcmp("-nopos", next_argument)) {
- word_positions = false;
- }
- else if (0 == strcmp("-pos", next_argument)) {
- word_positions = true;
- }
- else if (0 == strcmp("-nopairs", next_argument)) {
- word_pairs = false;
- }
- else if (0 == strcmp("-pairs", next_argument)) {
- word_pairs = true;
- }
- else if (0 == strcmp("-nocat", next_argument)) {
- make_catalog = false;
- }
- else if(0 == strcmp("-mem", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number for the amount of memory to use");
- memory_to_use = atol(next_argument);
- if(memory_to_use < 1)
- panic("The -mem argument should not be less than 1");
- if(memory_to_use > 200)
- fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes. That is a large number of mega bytes in current machines\n", memory_to_use);
- }
- else if(0 == strcmp("-cmmem", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number (1-100) for percentage of memory to use");
- cm_mem_percent = atol(next_argument);
- if(cm_mem_percent < 1)
- panic("The -cmmem argument should not be less than 1 and less than 100");
- if(cm_mem_percent > 100)
- panic("Warning: The -cmmem parameter was %ld%%. It should be between 1-100.", cm_mem_percent);
- }
- else if(0 == strcmp("-grow", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number (1-100) for database growing percentage");
- grow_percent = atol(next_argument);
- if(grow_percent < 1)
- panic("The -grow argument should not be less than 1");
- }
- else if(0 == strcmp("-textsize", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number for text size in megabytes");
- text_size = atol(next_argument);
- if(text_size < 1)
- panic("The -textsize argument should not be less than 1");
- }
- else if (0 == strcmp("-e", next_argument)) {
- char *peek_argument = peek_arg(&argc, &argv);
- log_file_name = "/dev/null"; /* default to /dev/null */
- if ((peek_argument != NULL) &&
- ('-' != peek_argument[0])) {
- log_file_name = next_arg(&argc, &argv);
- } /* end if (explicit log file) */
- } /* end if (-e) */
- else if (0 == strcmp("-l", next_argument)) {
- wais_log_level = atol(next_arg(&argc, &argv));
- } /* end if (-l) */
- else if(0 == strcmp("-cm", next_argument)){
- /* this is an undocumented argument to help use this to
- front end the CM application */
- indexingForBeta = true;
- }
- else if(0 == strcmp("-T", next_argument)){
- /* This is a specification for a "Special" type. The next argument
- is the type name. This will not index the body of the file. */
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a file type");
- typename = next_argument;
- /* type = next_argument;*/
- strcpy(type, next_argument);
- finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("-contents", next_argument)){
- index_contents = true;
- }
- else if(0 == strcmp("-nocontents", next_argument)){
- index_contents = false;
- }
- else if(0 == strcmp("-t", next_argument)){
- /* then we have a specialized file */
- index_contents = true;
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a file type");
- for(t = file_type_list; t->name; t++) {
- if (strcmp(t->name, next_argument) == 0) {
- typename = t->name;
- strcpy(type, t->type);
- set(separator_function, t->separator_function);
- set(header_function, t->header_function);
- set(date_function, t->date_function);
- set(finish_header_function, t->finish_header_function);
- set(index_contents, t->index_contents);
- goto found;
- }
- }
- panic("Don't recognize the '%s' type", next_argument);
-
- found:
- ;
-
- #if 0
- if(0 == strcmp("groliers", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = groliers_separator_function;
- header_function = groliers_header_function;
- finish_header_function = groliers_finish_header_function;
- }
- #ifdef NeXT
- else if(0 == strcmp("objc", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = wobjc_separator_function;
- header_function = wobjc_header_function;
- finish_header_function = wobjc_finish_header_function;
- }
- #endif /* def NeXT */
- else if(0 == strcmp("mail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("erg_mail_thread", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = erg_thread_separator_function;
- header_function = erg_thread_header_function;
- date_function = mail_date_function;
- finish_header_function = erg_thread_finish_header_function;
- }
- else if(0 == strcmp("mail_or_rmail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_or_rmail_separator;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mail_digest", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mail_digest_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mh_bboard", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = mh_bboard_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("rmail", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = rmail_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("netnews", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = NULL;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("rn", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = rn_separator_function;
- header_function = mail_header_function;
- date_function = mail_date_function;
- finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("emacsinfo", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = emacs_info_separator_function;
- header_function = emacs_info_header_function;
- finish_header_function = emacs_info_finish_header_function;
- }
- else if(0 == strcmp("catalog", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = catalog_separator_function;
- header_function = catalog_header_function;
- finish_header_function = catalog_finish_header_function;
- }
- else if(0 == strcmp("bio", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = bio_separator_function;
- header_function = bio_header_function;
- finish_header_function = bio_finish_header_function;
- }
- else if(0 == strcmp("cmapp", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = cmapp_separator_function;
- header_function = cmapp_header_function;
- finish_header_function = cmapp_finish_header_function;
- }
- else if(0 == strcmp("ftp", next_argument)){
- type = "TEXT-FTP";
- typename = next_argument;
- separator_function = first_line_separator_function;
- header_function = first_line_header_function;
- finish_header_function = first_line_finish_header_function;
- }
- else if(0 == strcmp("jargon", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = jargon_separator_function;
- header_function = jargon_header_function;
- finish_header_function = jargon_finish_header_function;
- }
- else if(0 == strcmp("server", next_argument)){
- typename = next_argument;
- type = "WSRC";
- finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("text", next_argument)){
- type = "TEXT";
- typename = next_argument;
- check_for_text_file = true;
- }
- else if(0 == strcmp("filename", next_argument)){
- type = "TEXT";
- typename = next_argument;
- finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("irg", next_argument)){
- typename = next_argument;
- type = "TEXT";
- separator_function = irg_separator_function;
- header_function = irg_header_function;
- finish_header_function = irg_finish_header_function;
- }
- /* dash-separated items , Intro to Algorithms buglist, etc */
- else if(0 == strcmp("dash", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = dash_separator_function;
- header_function = dash_header_function;
- finish_header_function = dash_finish_header_function;
- }
- /* one_line-separated items */
- else if(0 == strcmp("one_line", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = one_line_separator_function;
- header_function = one_line_header_function;
- finish_header_function = one_line_finish_header_function;
- }
- /* blank line-separated items (paragraphs) */
- else if(0 == strcmp("para", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = para_separator_function;
- header_function = para_header_function;
- finish_header_function = para_finish_header_function;
- }
- /* seeker items */
- else if(0 == strcmp("seeker", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = seeker_separator_function;
- header_function = seeker_header_function;
- finish_header_function = seeker_finish_header_function;
- }
- /* medline format */
- else if(0 == strcmp("medline", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = medline_separator_function;
- header_function = medline_header_function;
- finish_header_function = medline_finish_header_function;
- }
- /* refer format */
- else if(0 == strcmp("refer", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = refer_separator_function;
- header_function = refer_header_function;
- finish_header_function = refer_finish_header_function;
- }
- /* first_line format */
- else if(0 == strcmp("first_line", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = first_line_separator_function;
- header_function = first_line_header_function;
- finish_header_function = first_line_finish_header_function;
- }
- /* rlin items */
- else if(0 == strcmp("rlin", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = rlin_separator_function;
- header_function = rlin_header_function;
- finish_header_function = rlin_finish_header_function;
- }
- else if(0 == strcmp("dvi", next_argument)){
- typename = next_argument;
- type = "DVI";
- finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("ps", next_argument)){
- typename = next_argument;
- type = "PS";
- finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("pict", next_argument)){
- typename = next_argument;
- type = "PICT";
- finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- else if(0 == strcmp("gif", next_argument)){
- typename = next_argument;
- type = "GIF";
- finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- else if(0 == strcmp("tiff", next_argument)){
- typename = next_argument;
- type = "TIFF";
- finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- /* BibTeX items */
- else if(0 == strcmp("bibtex", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = bibtex_separator_function;
- header_function = bibtex_header_function;
- finish_header_function = bibtex_finish_header_function;
- }
- /* ?:? seperated hypertext items */
- else if(0 == strcmp("nhyp", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = nhyp_separator_function;
- header_function = nhyp_header_function;
- finish_header_function = nhyp_finish_header_function;
- }
- else if(0 == strcmp("ziff", next_argument)){
- type = "TEXT";
- typename = next_argument;
- separator_function = ziff_separator_function;
- header_function = ziff_header_function;
- finish_header_function = ziff_finish_header_function;
- }
- else{
- panic("Don't recognize the '%s' type", next_argument);
- }
- #endif /* 0 */
- }
- else{
- panic("Don't recognize the '%s' option", next_argument);
- }
-
- next_argument = next_arg(&argc, &argv);
- if (! (read_files_from_stdin || next_argument)) {
- fprintf(stderr,"No files specified\n");
- exit(0);
- }
- }
- start_of_filenames = argc_copy - argc - 1;
-
- /* check index */
- if(0 == strlen(pathname_name(index_filename))){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.",
- index_filename);
- exit(0);
- }
-
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build database %s",
- index_filename);
-
- if(0 != init_search_engine(index_filename, false, false, cm_mem_percent,
- text_size, grow_percent))
- panic("unable to initialize search engine");
-
- if(true == adding_to_existing_index){
- db = openDatabase(index_filename, false, false);
- if (db == NULL){ /* does not exist, create one */
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
- }
- else{
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
- { /* set up the memory hashtable */
-
- if(memory_to_use < 0){ /* default */
- /* do nothing */
- }
- else if(memory_to_use <= 2){
- hashtable_size = 1L<<16;
- flush_after_n_words = 50000;
- }
- else if(memory_to_use <= 5){
- hashtable_size = 1L<<16;
- flush_after_n_words = 150000;
- }
- else if(memory_to_use <= 10){
- /* shown to take about 6MB on a sun4, when it is dict limited */
- hashtable_size = 1L<<16;
- flush_after_n_words = 300000;
- }
- else if(memory_to_use <= 20){
- hashtable_size = 1L<<17;
- flush_after_n_words = 600000;
- }
- else{ /* over 20 Mbytes */
- hashtable_size = 1L<<18;
- flush_after_n_words = 1000000;
- }
- init_add_word(db, hashtable_size, flush_after_n_words);
- }
-
- if (read_files_from_stdin) {
- if (0 != (next_argument = fgets(data_filename, MAX_PATH_NAME_LEN, stdin))) {
- int len = strlen(next_argument);
- if (next_argument[len-1] == '\n') {
- next_argument[len-1] = '\0';
- }
- }
- }
-
- while(NULL != next_argument){ /* the first filename is in next_argument already */
- if(directoryp(next_argument)){
- if(traverse_directory){
- index_directory(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file,
- adding_to_existing_index,
- word_positions, word_pairs);
- }
- }
- else{ /* not a directory */
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", next_argument);
- index_text_file(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file, adding_to_existing_index,
- word_positions, word_pairs);
- }
- if (read_files_from_stdin) {
- if (0 != (next_argument = fgets(data_filename, MAX_PATH_NAME_LEN, stdin))) {
- int len = strlen(next_argument);
- if (next_argument[len-1] == '\n') {
- next_argument[len-1] = '\0';
- }
- }
- }
- else {
- next_argument = next_arg(&argc, &argv);
- }
- }
- finished_add_word(db);
- {
- char filename[MAX_FILENAME_LEN + 1];
- if(!probe_file(source_filename(filename, db))){
- char database_name[MAX_FILENAME_LEN];
- write_src_structure(source_filename(filename, db),
- export_database?pathname_name(index_filename):
- truename(index_filename, database_name),
- typename,
- &argv_copy[start_of_filenames],
- argc_copy - start_of_filenames,
- export_database,
- 210L);
- }
- /* write out a description of the server if appropriate */
- if(register_database){
- register_src_structure(source_filename(filename, db));
- }
- }
- if(make_catalog) build_catalog(db);
- closeDatabase(db);
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build");
- exit(0);
- }
-